1 Carga de librerías

library(tidyverse)
library(gt)
library(kableExtra)
library(gtsummary)
library(corrplot)
library(car)
library(caret)
library(pROC)

2 Carga de datos

# Read each diagnostic group from its own sheet of the Excel workbook.
library(readxl)
file_path <- "/home/guido/Downloads/CogED xDiag xVero.xlsx"
# NOTE(review): the AD sheet is read starting at column B (not A) -- presumably
# its first column holds something to skip; confirm against the workbook.
dataAD <- read_excel(file_path, sheet = "CogEd AD", range = "B1:R478")



dataCN <- read_excel(file_path, sheet = "CogEd CN", range = "A1:Q669")

dataFTD <- read_excel(file_path, sheet = "CogEd FTD", range = "A1:Q274")

# Sanity check: all three data frames should have the same 17 columns.
cat("dataAD:",  dim(dataAD),  "\n")
## dataAD: 477 17
cat("dataCN:",  dim(dataCN),  "\n")
## dataCN: 668 17
cat("dataFTD:", dim(dataFTD), "\n")
## dataFTD: 273 17

3 Limpieza de variables

# Stack the three diagnostic groups into one data frame.
data <- rbind(dataAD, dataCN, dataFTD)
data$clinical_diagnosis <- as.factor(data$clinical_diagnosis)

# Several test columns mix numbers with the sentinel string "completed".
# Recode the sentinel to the value it stands for, then coerce to numeric.
recode_completed <- function(x, completed_value) {
  as.numeric(ifelse(x == "completed", completed_value, x))
}

data$cog_tmt_a_err  <- recode_completed(data$cog_tmt_a_err, 0)    # completed => 0 errors
data$cog_tmt_a_corr <- recode_completed(data$cog_tmt_a_corr, 25)  # completed => full score
data$cog_tmt_b_err  <- recode_completed(data$cog_tmt_b_err, 0)
data$cog_tmt_b_corr <- recode_completed(data$cog_tmt_b_corr, 25)
data$cog_digits_forward_span  <- recode_completed(data$cog_digits_forward_span, 8)   # max span
data$cog_digits_backward_span <- recode_completed(data$cog_digits_backward_span, 7)  # max span

# Values above the instrument's plausible maximum are treated as data-entry
# errors and censored to NA.
censor_above <- function(x, max_valid) {
  ifelse(x > max_valid, NA, x)
}

data$cog_digits_forward_total  <- censor_above(data$cog_digits_forward_total, 16)
data$cog_digits_backward_total <- censor_above(data$cog_digits_backward_total, 14)
data$cog_tmt_a <- censor_above(data$cog_tmt_a, 150)  # seconds
data$cog_tmt_b <- censor_above(data$cog_tmt_b, 300)  # seconds

4 Distribución de valores

# Plot a light-blue histogram of one numeric column.
#
# Args:
#   data:   data frame containing `column`.
#   title:  plot title.
#   column: name (string) of the column to plot.
#   xlab, ylab: axis labels. Defaults keep the original behavior
#     ("Años" / "Frecuencia"); pass other labels for variables that are not
#     measured in years.
#
# Returns the "histogram" object (as returned by hist()).
plot_col_hist <- function(data, title, column,
                          xlab = "Años", ylab = "Frecuencia") {
  hist(data[[column]],
       main = title,
       xlab = xlab,
       ylab = ylab,
       col = "lightblue",
       breaks = 20)
}

# Histograms: years of education, overall and per diagnostic group.
plot_col_hist(data,   "Años de educación","cog_ed")

plot_col_hist(dataAD, "Años de educación AD","cog_ed")

plot_col_hist(dataCN, "Años de educación CN","cog_ed")

plot_col_hist(dataFTD, "Años de educación FTD","cog_ed")

# Histograms of the cognitive test scores (pooled data).
# NOTE(review): these plots inherit xlab = "Años" from plot_col_hist(),
# which is inaccurate here -- the variables below are scores/seconds,
# not years.
plot_col_hist(data, "cog_digits_forward_total","cog_digits_forward_total")

plot_col_hist(data, "cog_digits_backward_total","cog_digits_backward_total")

plot_col_hist(data, "mmse_total","mmse_total")

plot_col_hist(data, "cog_category_animals","cog_category_animals")

plot_col_hist(data, "cog_category_vegetables","cog_category_vegetables")

plot_col_hist(data, "cog_tmt_a","cog_tmt_a")

plot_col_hist(data, "cog_tmt_b","cog_tmt_b")

# Drop identifier columns and count, per patient, how many of the five core
# tests are missing (NA).
core_tests <- c(
  "cog_digits_forward_total",
  "cog_category_animals",
  "cog_category_vegetables",
  "cog_tmt_a",
  "cog_tmt_b"
)

data <- data %>%
  select(-codMP, -id_paciente) %>%
  mutate(tests_no_hechos = rowSums(is.na(across(all_of(core_tests)))))

5 Descarte de pacientes con 5 tests no hechos

# Keep only patients who completed at least one of the five core tests.
data <- data %>%
  filter(tests_no_hechos < 5)

6 Tabla 1 (considerando 3 categorias)

# Table 1: descriptive statistics by diagnosis (AD / CN / FTD), with
# Kruskal-Wallis p-values for the three-group comparison.
tabla1 <- 
  tbl_summary(
    data = data,
    by = clinical_diagnosis,          # compare across diagnostic groups
    include = c(                      # only these variables
      cog_ed,
      cog_digits_forward_total,
      cog_digits_forward_span,
      cog_digits_backward_total,
      cog_digits_backward_span,
      mmse_total,
      cog_category_animals,
      cog_category_vegetables,
      cog_tmt_a,
      cog_tmt_a_corr,
      cog_tmt_a_err,
      cog_tmt_b,
      cog_tmt_b_corr,
      cog_tmt_b_err
    ),
    missing = "ifany",
    missing_text = "No realizado",
    label = list(
      cog_ed ~ "Años de educación",
      cog_digits_forward_total  ~ "Digits forward total",
      cog_digits_forward_span   ~ "Digits forward span",
      cog_digits_backward_total ~ "Digits backwards total",
      cog_digits_backward_span  ~ "Digits backwards span",
      mmse_total                ~ "Mini mental",
      cog_category_animals      ~ "Fluidez semántica (animales)",
      cog_category_vegetables   ~ "Fluidez semántica (vegetales)",
      cog_tmt_a       ~ "Atención sostenida (tiempo)",
      cog_tmt_a_corr  ~ "Atención sostenida (correctas)",
      cog_tmt_a_err   ~ "Atención sostenida (errores)",
      cog_tmt_b       ~ "Funciones ejecutivas (tiempo)",
      cog_tmt_b_corr  ~ "Funciones ejecutivas (correctas)",
      cog_tmt_b_err   ~ "Funciones ejecutivas (errores)"
    ),
    type = list(                     # force all to be summarized as continuous
      cog_ed ~ "continuous",
      cog_digits_forward_total  ~ "continuous",
      cog_digits_forward_span   ~ "continuous",
      cog_digits_backward_total ~ "continuous",
      cog_digits_backward_span  ~ "continuous",
      mmse_total                ~ "continuous",
      cog_category_animals      ~ "continuous",
      cog_category_vegetables   ~ "continuous",
      cog_tmt_a       ~ "continuous",
      cog_tmt_a_corr  ~ "continuous",
      cog_tmt_a_err   ~ "continuous",
      cog_tmt_b       ~ "continuous",
      cog_tmt_b_corr  ~ "continuous",
      cog_tmt_b_err   ~ "continuous"
    ),
    statistic = list(
      all_continuous()  ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  add_overall() %>%
  add_p(
    test = list(
      all_continuous()  ~ "kruskal.test",
      all_categorical() ~ "chisq.test"
    )
  ) %>%
  bold_labels()

# Render the table.
tabla1
Characteristic Overall
N = 1,106
1
AD
N = 409
1
CN
N = 494
1
FTD
N = 203
1
p-value2
Años de educación 13.0 (6.0, 16.0) 12.0 (8.0, 16.0) 12.0 (5.0, 17.0) 14.0 (11.0, 16.0) <0.001
Digits forward total 5.00 (4.00, 7.00) 5.00 (3.00, 6.00) 5.00 (4.00, 7.00) 5.00 (3.00, 6.00) <0.001
    No realizado 10 5 0 5
Digits forward span 5.00 (4.00, 6.00) 5.00 (4.00, 6.00) 5.00 (4.00, 6.00) 5.00 (4.00, 6.00) 0.001
    No realizado 7 4 0 3
Digits backwards total 4.00 (3.00, 5.00) 3.00 (2.00, 4.00) 4.00 (3.00, 6.00) 3.00 (2.00, 5.00) <0.001
    No realizado 25 7 6 12
Digits backwards span 3.00 (3.00, 4.00) 3.00 (2.00, 4.00) 4.00 (3.00, 5.00) 3.00 (2.00, 4.00) <0.001
    No realizado 7 4 0 3
Mini mental 24.0 (20.0, 28.0) 21.0 (18.0, 24.0) 28.0 (24.0, 29.0) 23.0 (17.0, 26.0) <0.001
    No realizado 3 2 1 0
Fluidez semántica (animales) 14 (10, 20) 11 (8, 14) 19 (15, 23) 9 (5, 15) <0.001
    No realizado 6 3 0 3
Fluidez semántica (vegetales) 11 (6, 15) 7 (4, 10) 15 (12, 19) 6 (3, 10) <0.001
    No realizado 19 8 1 10
Atención sostenida (tiempo) 65 (40, 107) 88 (56, 147) 49 (33, 79) 76 (46, 122) <0.001
    No realizado 64 36 4 24
Atención sostenida (correctas) 24.0 (24.0, 24.0) 24.0 (24.0, 24.0) 24.0 (24.0, 24.0) 24.0 (24.0, 24.0) <0.001
    No realizado 26 17 0 9
Atención sostenida (errores) 0.00 (0.00, 0.00) 0.00 (0.00, 0.00) 0.00 (0.00, 0.00) 0.00 (0.00, 1.00) 0.033
    No realizado 26 17 0 9
Funciones ejecutivas (tiempo) 168 (92, 300) 300 (164, 300) 110 (71, 208) 195 (126, 300) <0.001
    No realizado 285 136 83 66
Funciones ejecutivas (correctas) 24 (23, 24) 24 (13, 25) 24 (24, 24) 24 (22, 25) <0.001
    No realizado 53 35 1 17
Funciones ejecutivas (errores) 0.00 (0.00, 2.00) 1.00 (0.00, 3.00) 0.00 (0.00, 1.00) 0.00 (0.00, 2.00) <0.001
    No realizado 54 35 2 17
1 Median (Q1, Q3)
2 Kruskal-Wallis rank sum test

7 Tabla 1 considerando AD vs FTD

# Restrict to the two dementia groups for a head-to-head comparison,
# dropping the now-empty "CN" factor level.
data2 <- data %>%
  filter(clinical_diagnosis != "CN") 
data2$clinical_diagnosis <- droplevels(data2$clinical_diagnosis)

# Table 1 for AD vs FTD; Wilcoxon rank-sum p-values (two groups).
tabla1 <- 
  tbl_summary(
    data = data2,
    by = clinical_diagnosis,          # compare across diagnostic groups
    include = c(                      # only these variables
      cog_ed,
      cog_digits_forward_total,
      cog_digits_forward_span,
      cog_digits_backward_total,
      cog_digits_backward_span,
      mmse_total,
      cog_category_animals,
      cog_category_vegetables,
      cog_tmt_a,
      cog_tmt_a_corr,
      cog_tmt_a_err,
      cog_tmt_b,
      cog_tmt_b_corr,
      cog_tmt_b_err
    ),
    missing = "ifany",
    missing_text = "No realizado",
    label = list(
      cog_ed ~ "Años de educación",
      cog_digits_forward_total  ~ "Digits forward total",
      cog_digits_forward_span   ~ "Digits forward span",
      cog_digits_backward_total ~ "Digits backwards total",
      cog_digits_backward_span  ~ "Digits backwards span",
      mmse_total                ~ "Mini mental",
      cog_category_animals      ~ "Fluidez semántica (animales)",
      cog_category_vegetables   ~ "Fluidez semántica (vegetales)",
      cog_tmt_a       ~ "Atención sostenida (tiempo)",
      cog_tmt_a_corr  ~ "Atención sostenida (correctas)",
      cog_tmt_a_err   ~ "Atención sostenida (errores)",
      cog_tmt_b       ~ "Funciones ejecutivas (tiempo)",
      cog_tmt_b_corr  ~ "Funciones ejecutivas (correctas)",
      cog_tmt_b_err   ~ "Funciones ejecutivas (errores)"
    ),
    type = list(                     # force all to be summarized as continuous
      cog_ed ~ "continuous",
      cog_digits_forward_total  ~ "continuous",
      cog_digits_forward_span   ~ "continuous",
      cog_digits_backward_total ~ "continuous",
      cog_digits_backward_span  ~ "continuous",
      mmse_total                ~ "continuous",
      cog_category_animals      ~ "continuous",
      cog_category_vegetables   ~ "continuous",
      cog_tmt_a       ~ "continuous",
      cog_tmt_a_corr  ~ "continuous",
      cog_tmt_a_err   ~ "continuous",
      cog_tmt_b       ~ "continuous",
      cog_tmt_b_corr  ~ "continuous",
      cog_tmt_b_err   ~ "continuous"
    ),
    statistic = list(
      all_continuous()  ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  add_overall() %>%
  add_p(
    test = list(
      all_continuous()  ~ "wilcox.test",
      all_categorical() ~ "chisq.test"
    )
  ) %>%
  bold_labels()

# Render the table.
tabla1
Characteristic Overall
N = 612
1
AD
N = 409
1
FTD
N = 203
1
p-value2
Años de educación 13.5 (10.0, 16.0) 12.0 (8.0, 16.0) 14.0 (11.0, 16.0) 0.006
Digits forward total 5.00 (3.00, 6.00) 5.00 (3.00, 6.00) 5.00 (3.00, 6.00) >0.9
    No realizado 10 5 5
Digits forward span 5.00 (4.00, 6.00) 5.00 (4.00, 6.00) 5.00 (4.00, 6.00) 0.6
    No realizado 7 4 3
Digits backwards total 3.00 (2.00, 4.00) 3.00 (2.00, 4.00) 3.00 (2.00, 5.00) >0.9
    No realizado 19 7 12
Digits backwards span 3.00 (2.00, 4.00) 3.00 (2.00, 4.00) 3.00 (2.00, 4.00) 0.3
    No realizado 7 4 3
Mini mental 21.0 (18.0, 25.0) 21.0 (18.0, 24.0) 23.0 (17.0, 26.0) 0.004
    No realizado 2 2 0
Fluidez semántica (animales) 10.0 (7.0, 14.0) 11.0 (8.0, 14.0) 9.0 (5.0, 15.0) 0.005
    No realizado 6 3 3
Fluidez semántica (vegetales) 6.0 (4.0, 10.0) 7.0 (4.0, 10.0) 6.0 (3.0, 10.0) 0.066
    No realizado 18 8 10
Atención sostenida (tiempo) 84 (51, 137) 88 (56, 147) 76 (46, 122) 0.008
    No realizado 60 36 24
Atención sostenida (correctas) 24.0 (24.0, 24.0) 24.0 (24.0, 24.0) 24.0 (24.0, 24.0) 0.3
    No realizado 26 17 9
Atención sostenida (errores) 0.00 (0.00, 1.00) 0.00 (0.00, 0.00) 0.00 (0.00, 1.00) 0.5
    No realizado 26 17 9
Funciones ejecutivas (tiempo) 269 (147, 300) 300 (164, 300) 195 (126, 300) <0.001
    No realizado 202 136 66
Funciones ejecutivas (correctas) 24 (14, 25) 24 (13, 25) 24 (22, 25) 0.031
    No realizado 52 35 17
Funciones ejecutivas (errores) 1.00 (0.00, 3.00) 1.00 (0.00, 3.00) 0.00 (0.00, 2.00) 0.030
    No realizado 52 35 17
1 Median (Q1, Q3)
2 Wilcoxon rank sum test

8 Función auxiliar para correlaciones

# Draw a correlation heatmap for the given columns of `df`.
#
# Args:
#   df:           data frame holding the variables.
#   vars_to_keep: character vector of column names to correlate.
#   label_map:    named character vector mapping column names to display labels.
#   title:        optional plot title.
plot_corr <- function(df, vars_to_keep, label_map, title = NULL) {

  # Pairwise Pearson correlations over the selected columns.
  selected <- df %>%
    select(all_of(vars_to_keep))

  corr_mat <- cor(selected, use = "pairwise.complete.obs", method = "pearson")

  # Swap raw column names for their human-readable labels.
  dimnames(corr_mat) <- list(
    label_map[rownames(corr_mat)],
    label_map[colnames(corr_mat)]
  )

  # Diverging red-blue palette.
  palette_cols <- colorRampPalette(RColorBrewer::brewer.pal(11, "RdBu"))(100)

  # Upper triangle only, ordered by hierarchical clustering.
  corrplot(corr_mat,
           method = "color",
           type = "upper",
           order = "hclust",
           tl.cex = 0.8,
           col = palette_cols,
           title = title,
           mar = c(0, 0, 2, 0))
}

9 Gráfico de correlaciones

# NOTE(review): `data_num` and `mat_cor` computed here are never used --
# plot_corr() recomputes the correlation matrix internally. Dead code kept
# as-is.
data_num <- data %>% 
  select(where(is.numeric))

mat_cor <- cor(data_num, use = "pairwise.complete.obs", method = "pearson")

# Display labels for the full correlation plot.
label_map <- c(
  mmse_total                = "MMSE",
  cog_digits_forward_total  = "DF total",
  cog_digits_backward_total = "DB total",
  cog_digits_forward_span   = "DF span",
  cog_digits_backward_span  = "DB span",
  cog_category_animals      = "Fluidez animales",
  cog_category_vegetables   = "Fluidez vegetales",
  cog_tmt_a       = "TMT-A (tiempo)",
  cog_tmt_a_corr  = "TMT-A (corr.)",
  cog_tmt_a_err   = "TMT-A (errores)",
  cog_tmt_b       = "TMT-B (tiempo)",
  cog_tmt_b_corr  = "TMT-B (corr.)",
  cog_tmt_b_err   = "TMT-B (errores)"
)

vars_to_keep <- names(label_map)

plot_corr(data,  vars_to_keep, label_map, title = "Correlaciones")

10 Gráfico de correlaciones (simplificado)

# NOTE(review): `data_num` and `mat_cor` are again computed and never used;
# dead code kept as-is.
data_num <- data %>% 
  select(where(is.numeric))

mat_cor <- cor(data_num, use = "pairwise.complete.obs", method = "pearson")

# Simplified label set (totals only; no spans, errors or correct counts).
label_map <- c(
  mmse_total                = "Desempeño Total (MMSE)",
  cog_digits_forward_total  = "Digits Forward",
  cog_digits_backward_total = "Digits Backward",
  cog_category_animals      = "Fluidez animales",
  cog_category_vegetables   = "Fluidez vegetales",
  cog_tmt_a       = "Atención sostenida",
  cog_tmt_b       = "Funciones Ejecutivas"
)


vars_to_keep <- names(label_map)

plot_corr(data,  vars_to_keep, label_map, title = "Correlaciones")

# NOTE(review): this label_map is byte-identical to the one above (duplicated
# definition). Also note that dataAD / dataCN / dataFTD below are the raw
# sheets -- the cleaning applied to `data` (sentinel recoding, outlier
# censoring) was not applied to them.
label_map <- c(
  mmse_total                = "Desempeño Total (MMSE)",
  cog_digits_forward_total  = "Digits Forward",
  cog_digits_backward_total = "Digits Backward",
  cog_category_animals      = "Fluidez animales",
  cog_category_vegetables   = "Fluidez vegetales",
  cog_tmt_a       = "Atención sostenida",
  cog_tmt_b       = "Funciones Ejecutivas"
)

vars_to_keep <- names(label_map)

plot_corr(dataAD,  vars_to_keep, label_map, title = "Correlaciones - AD")

plot_corr(dataCN,  vars_to_keep, label_map, title = "Correlaciones - CN")

plot_corr(dataFTD, vars_to_keep, label_map, title = "Correlaciones - FTD")

# One representative variable per cognitive domain.
label_map <- c(
  mmse_total                = "Desempeño Total (MMSE)",
  cog_digits_forward_total  = "Memoria",
  cog_category_animals      = "Lenguaje",
  cog_tmt_b       = "Funciones Ejecutivas - Atención"
)

vars_to_keep <- names(label_map)

plot_corr(dataAD,  vars_to_keep, label_map, title = "Correlaciones - AD")

plot_corr(dataCN,  vars_to_keep, label_map, title = "Correlaciones - CN")

plot_corr(dataFTD, vars_to_keep, label_map, title = "Correlaciones - FTD")

# Columns and display labels.
label_map <- c(
  mmse_total                = "Desempeño Total (MMSE)",
  cog_digits_forward_total  = "Memoria",
  cog_category_animals      = "Lenguaje",
  cog_tmt_a                 = "Atención",
  cog_tmt_b                 = "Funciones Ejecutivas - Atención"
)

vars_to_keep <- names(label_map)

# Correlations on the original (unpermuted) AD data, as a reference.
plot_corr(dataAD, vars_to_keep, label_map, title = "Correlaciones - AD (original)")

# Number of permutations to display.
n_perm_show <- 5
set.seed(123) # reproducibility

# Column whose values are shuffled in each permutation.
column_to_permute <- "cog_tmt_a"

# Plot the correlation structure after each of `n_perm_show` independent
# permutations of that column (its correlations should collapse toward 0).
# seq_len() is used instead of 1:n to be safe when the count is zero.
for (i in seq_len(n_perm_show)) {
  data_perm <- dataAD
  data_perm[[column_to_permute]] <- sample(data_perm[[column_to_permute]])

  plot_corr(data_perm, vars_to_keep, label_map, 
            title = paste("Correlaciones - AD (perm", i, ")"))
}

# Columns and display labels.
label_map <- c(
  mmse_total                = "Desempeño Total (MMSE)",
  cog_digits_forward_total  = "Memoria",
  cog_category_animals      = "Lenguaje",
  cog_tmt_a                 = "Atención",
  cog_tmt_b                 = "Funciones Ejecutivas - Atención"
)

vars_to_keep <- names(label_map)

# Original (unpermuted) correlations for reference.
plot_corr(dataAD, vars_to_keep, label_map,
          title = "Correlaciones - AD (original)")

# Number of permutations to show per pair.
n_perm_show <- 5
set.seed(123)

# All pairwise combinations of the selected variables.
pairs <- combn(vars_to_keep, 2, simplify = FALSE)

# NOTE(review): only pair[1] is permuted; shuffling that column destroys its
# correlation with EVERY variable, not just pair[2], so the "vs" in the title
# overstates what each plot isolates. Pairs sharing the same first element
# therefore produce statistically equivalent plots.
for (pair in pairs) {
  
  col_perm <- pair[1]
  col_ref  <- pair[2]
  
  for (i in seq_len(n_perm_show)) {
    
    data_perm <- dataAD
    data_perm[[col_perm]] <- sample(data_perm[[col_perm]])
    
    plot_corr(
      data_perm,
      vars_to_keep,
      label_map,
      title = paste0(
        "AD | permutando: ", label_map[col_perm],
        " vs ", label_map[col_ref],
        " (perm ", i, ")"
      )
    )
  }
}

# Drop the digit-span variables and the helper count from the working data.

# Full data set.
data <- data %>%
  select(-cog_digits_backward_span, -cog_digits_forward_span, -tests_no_hechos)

# AD and FTD only (drop the empty "CN" level).
data2 <- data %>%
  filter(clinical_diagnosis != "CN")
data2$clinical_diagnosis <- droplevels(data2$clinical_diagnosis)

# Human-readable names for the 12 predictor variables.
label_map <- c(
  cog_ed                   = "Nivel educativo",
  cog_digits_forward_total = "Memoria inmediata",
  cog_digits_backward_total= "Memoria de trabajo",
  mmse_total               = "MMSE",
  cog_category_animals     = "Fluidez (animales)",
  cog_category_vegetables  = "Fluidez (vegetales)",
  cog_tmt_a                = "Atención",
  cog_tmt_b                = "Funciones ejecutivas",
  cog_tmt_a_err            = "Errores de atención",
  cog_tmt_b_err            = "Errores ejecutivos",
  cog_tmt_a_corr           = "Correctas atención",
  cog_tmt_b_corr           = "Correctas ejecutivas"
)

# Select the predictors by name rather than by position: the original
# `vars <- colnames(data); vars[2:13]` silently breaks if the column order
# ever changes.
vars_original <- names(label_map)
vars <- label_map[vars_original]

# Rename the predictor columns to their display labels in both data sets.
data <- data %>%
  rename_with(
    ~ ifelse(.x %in% names(label_map), label_map[.x], .x),
    .cols = all_of(vars_original)
  )

data2 <- data2 %>%
  rename_with(
    ~ ifelse(.x %in% names(label_map), label_map[.x], .x),
    .cols = all_of(vars_original)
  )

11 Correlaciones entre variables post permutaciones

#Test de correlaciones por permutación. Para cada par de variables se permuta una de ellas (N=10000) y luego se ordenan los pares según cuán desviado está el r observado respecto de la media de las permutaciones (es decir, cuántos desvíos estándar se aleja: z_perm).

# Permutation test for the correlation between columns `x` and `y` of `data`.
#
# The observed correlation is compared against the null distribution obtained
# by shuffling `x` `n_perm` times. Rows where either variable is NA are
# dropped *before* permuting, so every permuted correlation uses exactly the
# same observations (and the same n) as the observed one. The original code
# permuted first and filtered with use = "complete.obs" afterwards, which
# shuffles the NA positions and changes the effective sample on every draw.
#
# Returns a one-row tibble: observed r, permutation mean/sd, delta,
# empirical two-sided p-value, and a z-score of r_obs under the null.
perm_corr_test <- function(data, x, y, n_perm = 10000, method = "spearman") {
  
  # Keep only rows complete in both variables.
  keep <- stats::complete.cases(data[[x]], data[[y]])
  xv <- data[[x]][keep]
  yv <- data[[y]][keep]
  
  r_obs <- cor(xv, yv, method = method)
  
  r_perm <- replicate(n_perm, cor(sample(xv), yv, method = method))
  
  tibble(
    var_x = x,
    var_y = y,
    r_obs = r_obs,
    r_perm_mean = mean(r_perm),
    r_perm_sd = sd(r_perm),
    delta_r = r_obs - mean(r_perm),
    # Add-one smoothing keeps the empirical p-value strictly positive.
    p_value = (sum(abs(r_perm) >= abs(r_obs)) + 1) / (n_perm + 1),
    z_perm = (r_obs - mean(r_perm)) / sd(r_perm)
  )
}



library(purrr)

# Run the permutation correlation test for every pair of renamed variables.
results_all <- map_dfr(
  combn(vars, 2, simplify = FALSE),
  ~ perm_corr_test(data, .x[1], .x[2], n_perm = 10000)
)

# Benjamini-Hochberg correction for multiple comparisons.
results_all <- results_all |>
  dplyr::mutate(
    p_adj = p.adjust(p_value, method = "BH")
  ) 

# Most extreme pairs (largest |z|) first.
results_all |>
  arrange(desc(abs(z_perm)))

12 “Score” de colinealidad

#Ahora esto lo que arma es un score de fuerza de colinealidad sumando los valores absolutos de cada delta r para cada permutación. Con lo cual es un valor más general sobre colinealidad de esa variable.

# Overall "colinearity strength" of variable `x` against every other variable
# in `vars`: the sum over partners of |delta r|, where delta r is the observed
# correlation minus the mean correlation under permutation of `x`.
#
# As in perm_corr_test(), NA pairs are removed *before* permuting so every
# permuted draw uses exactly the same observations as the observed
# correlation (the original permute-then-"complete.obs" order shuffled the NA
# positions on every draw).
colinearity_score <- function(data, x, vars, n_perm = 10000, method = "spearman") {
  
  others <- setdiff(vars, x)
  
  # vapply (not sapply) guarantees a numeric(1) result per partner.
  deltas <- vapply(others, function(y) {
    
    keep <- stats::complete.cases(data[[x]], data[[y]])
    xv <- data[[x]][keep]
    yv <- data[[y]][keep]
    
    r_obs <- cor(xv, yv, method = method)
    
    r_perm <- replicate(n_perm, cor(sample(xv), yv, method = method))
    
    r_obs - mean(r_perm)
  }, numeric(1))
  
  tibble::tibble(
    var = x,
    colinearity_strength = sum(abs(deltas)),   # total strength
    mean_delta = mean(abs(deltas)),            # average
    max_delta = max(abs(deltas)),              # worst case
    n_links = length(deltas)                   # number of partners compared
  )
}

library(purrr)
library(dplyr)

# Colinearity score of each variable against all the others (full data set).
colinearity_tbl <- map_dfr(
  vars,
  ~ colinearity_score(
      data  = data,
      x     = .x,
      vars  = vars,
      n_perm = 10000   # or 5000 / 1000 for a quicker run
    )
)

# Strongest overall colinearity first.
colinearity_tbl %>%
  arrange(desc(colinearity_strength))

12.1 AD vs FTD

# Same analyses restricted to AD + FTD (data2).

results_all <- map_dfr(
  combn(vars, 2, simplify = FALSE),
  ~ perm_corr_test(data2, .x[1], .x[2], n_perm = 10000)
)

# Benjamini-Hochberg correction for multiple comparisons.
results_all <- results_all |>
  dplyr::mutate(
    p_adj = p.adjust(p_value, method = "BH")
  )

# Most extreme pairs first.
results_all |>
  arrange(desc(abs(z_perm)))
# Colinearity score per variable, AD + FTD only.
colinearity_tbl <- map_dfr(
  vars,
  ~ colinearity_score(
      data  = data2,
      x     = .x,
      vars  = vars,
      n_perm = 10000   # or 5000 / 1000 for a quicker run
    )
)

colinearity_tbl %>%
  arrange(desc(colinearity_strength))

13 Modelo logístico basal con todas las variables

# Logistic model with all predictors, to see which come out "significant".
# (Outcome is the two-level AD/FTD factor; glm drops rows with any NA.)
modelo_todas <- glm(clinical_diagnosis~., data2, family=binomial)
summary(modelo_todas)
## 
## Call:
## glm(formula = clinical_diagnosis ~ ., family = binomial, data = data2)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.0489005  1.4691475  -1.395 0.163131    
## `Nivel educativo`       0.0598843  0.0295263   2.028 0.042543 *  
## `Memoria inmediata`     0.0013200  0.0708769   0.019 0.985141    
## `Memoria de trabajo`   -0.0734425  0.0868618  -0.846 0.397826    
## MMSE                    0.1427961  0.0369396   3.866 0.000111 ***
## `Fluidez (animales)`   -0.0904753  0.0280834  -3.222 0.001274 ** 
## `Fluidez (vegetales)`  -0.0354267  0.0314615  -1.126 0.260151    
## Atención                0.0009477  0.0048826   0.194 0.846103    
## `Errores de atención`   0.3527964  0.1639014   2.152 0.031359 *  
## `Correctas atención`   -0.0354815  0.0386131  -0.919 0.358150    
## `Funciones ejecutivas` -0.0057870  0.0020984  -2.758 0.005818 ** 
## `Errores ejecutivos`    0.0609458  0.0419743   1.452 0.146507    
## `Correctas ejecutivas`  0.0322205  0.0227271   1.418 0.156274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 502.21  on 398  degrees of freedom
## Residual deviance: 442.07  on 386  degrees of freedom
##   (213 observations deleted due to missingness)
## AIC: 468.07
## 
## Number of Fisher Scoring iterations: 4
# Variance inflation factors (car::vif) for the full logistic model.
vif(modelo_todas)
##      `Nivel educativo`    `Memoria inmediata`   `Memoria de trabajo` 
##               1.206504               1.536875               1.769802 
##                   MMSE   `Fluidez (animales)`  `Fluidez (vegetales)` 
##               1.508477               1.913680               1.717200 
##               Atención  `Errores de atención`   `Correctas atención` 
##               2.669761               1.321394               1.447586 
## `Funciones ejecutivas`   `Errores ejecutivos` `Correctas ejecutivas` 
##               2.450613               1.400061               2.314099
#A pesar de que las variables son colineales, el VIF no es preocupantemente alto
#Hago un modelo logístico con todas las variables, pero evaluando el AUC por cross-validation

library(dplyr)
library(rsample)
library(purrr)
library(pROC)

#Funcion para calcular AUC de regresión logística por CV

# Estimate the AUC of a logistic regression by stratified v-fold
# cross-validation.
#
# Args:
#   data:       data frame containing the outcome and predictors.
#   outcome:    name (string) of the binary outcome column.
#   predictors: character vector of predictor column names.
#   v:          number of CV folds (stratified on the outcome).
#
# Returns a one-row tibble with mean / sd / min / max of the per-fold AUCs.
cv_auc_logistic <- function(data, outcome, predictors, v = 10) {
  
  df <- data[, c(outcome, predictors)]
  
  # Back-tick the predictor names: several contain spaces or accents.
  rhs <- paste(sprintf("`%s`", predictors), collapse = " + ")
  fml <- as.formula(paste(outcome, "~", rhs))
  
  folds <- vfold_cv(
    df,
    v = v,
    strata = !!sym(outcome)
  )
  
  fold_aucs <- purrr::map_dbl(folds$splits, function(split) {
    
    train_set <- rsample::analysis(split)
    test_set  <- rsample::assessment(split)
    
    # Fit on the analysis set, score the held-out assessment set.
    model <- glm(
      fml,
      data = train_set,
      family = binomial()
    )
    
    preds <- predict(model, test_set, type = "response")
    
    pROC::roc(
      response  = test_set[[outcome]],
      predictor = preds,
      quiet = TRUE
    )$auc
  })
  
  tibble::tibble(
    auc_mean = mean(fold_aucs),
    auc_sd   = sd(fold_aucs),
    auc_min  = min(fold_aucs),
    auc_max  = max(fold_aucs)
  )
}
# CV AUC with all predictors (mean AUC came out around 0.67).
set.seed(123)

auc_full <- cv_auc_logistic(
  data       = data2,
  outcome    = "clinical_diagnosis",
  predictors = vars,
  v          = 10
)

auc_full
#Grafico ese AUC "out of fold"
library(dplyr)
library(rsample)
library(purrr)
library(pROC)

# Collect out-of-fold predicted probabilities from a stratified v-fold CV of
# a logistic regression, so a single ROC/AUC can be computed on the pooled
# predictions.
#
# Args mirror cv_auc_logistic(); `seed` fixes the fold assignment.
# Returns a tibble with one row per observation: observed outcome `y` and
# its out-of-fold probability `prob`.
cv_oof_pred <- function(data, outcome, predictors, v = 10, seed = 123) {
  
  # Defensive cleanup: drop NA and empty predictor names.
  predictors <- predictors[!is.na(predictors) & predictors != ""]
  
  # Back-tick names containing spaces or accents.
  quoted <- sprintf("`%s`", predictors)
  
  set.seed(seed)
  
  df <- data[, c(outcome, predictors)]
  
  fml <- as.formula(
    paste(outcome, "~", paste(quoted, collapse = " + "))
  )
  
  folds <- rsample::vfold_cv(df, v = v, strata = !!rlang::sym(outcome))
  
  purrr::map_dfr(folds$splits, function(split) {
    
    train_set <- rsample::analysis(split)
    test_set  <- rsample::assessment(split)
    
    # Fit on the analysis set, predict on the held-out assessment set.
    model <- glm(
      fml,
      data = train_set,
      family = binomial()
    )
    
    probs <- predict(model, test_set, type = "response")
    
    tibble::tibble(
      y = test_set[[outcome]],
      prob = as.numeric(probs)
    )
  })
}

# Pooled out-of-fold predictions for the full model.
oof <- cv_oof_pred(
  data2,
  "clinical_diagnosis",
  vars,
  v = 10,
  seed = 123
)

# ROC built from the pooled out-of-fold probabilities.
roc_oof <- roc(
  response  = oof$y,
  predictor = oof$prob,
  quiet = TRUE
)

auc_val <- auc(roc_oof)

# ROC curve coordinates for ggplot.
roc_df <- tibble(
  fpr = 1 - roc_oof$specificities,
  tpr = roc_oof$sensitivities
)

library(ggplot2)

# ROC curve with the chance diagonal for reference.
ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line(linewidth = 1.3, color = "#1f77b4") +
  geom_abline(
    intercept = 0, slope = 1,
    linetype = "dashed",
    color = "grey60"
  ) +
  coord_equal() +
  labs(
    title = "ROC sin descartar variables",
    subtitle = paste0("AUC = ", round(auc_val, 3)),
    x = "1 − Especificidad",
    y = "Sensibilidad"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

## Descarte de variables por “score” de colinealidad

# colinearity_tbl ranks the variables by "colinearity strength". Remove the
# most colinear variables one at a time, in that order, and re-estimate the
# cross-validated AUC after each removal.
colinearity_tbl %>%
  arrange(desc(colinearity_strength))

# Removal order = decreasing colinearity strength. This reproduces the four
# reduced predictor sets of the original (hand-written) version.
drop_order <- c("Funciones ejecutivas", "Atención", "Memoria de trabajo", "MMSE")

auc_rows <- vector("list", length(drop_order))
vars_current <- vars

for (k in seq_along(drop_order)) {
  vars_current <- setdiff(vars_current, drop_order[k])

  # Same seed before every run so the fold assignments are comparable.
  set.seed(123)
  auc_rows[[k]] <- cv_auc_logistic(
    data       = data2,
    outcome    = "clinical_diagnosis",
    predictors = vars_current,
    v          = 10
  )
  print(auc_rows[[k]])
}

# As variables are removed for colinearity, we LOSE AUC.
tablaauc <- do.call(rbind, auc_rows)
rownames(tablaauc) <- c("Sacando 1 variable", "Sacando 2 variables",
                        "Sacando 3 variables", "Sacando 4 variables")
tablaauc

14 Regularización: Elastic Net, Ridge, Lasso

#Entonces probamos hacer Elastic Net (Lasso + Ridge, o cada una de ellas) para manejar colinealidad y seleccionar variables

library(glmnet)
library(pROC)
library(rsample)
library(purrr)
library(dplyr)

# Cross-validated AUC for a penalized logistic regression (glmnet).
#
# alpha = 0 -> ridge, alpha = 1 -> lasso, values in between -> elastic net.
# Within each outer fold, lambda is tuned by an inner 5-fold cv.glmnet and
# the AUC is computed on the held-out outer fold at lambda.min.
#
# Returns a one-row tibble with the model description and the mean/sd of the
# per-fold AUCs.
cv_auc_elastic <- function(data, outcome, predictors, alpha = 0.5, v = 10, seed = 123) {
  
  predictors <- predictors[!is.na(predictors)]  # guard against NA names
  predictors_q <- sprintf("`%s`", predictors)   # back-tick names with spaces/accents
  
  set.seed(seed)
  folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
  
  aucs <- purrr::map_dbl(seq_along(folds$splits), function(i) {
    # Re-seed per fold so the inner cv.glmnet fold assignment is reproducible.
    set.seed(seed + i)
    
    split <- folds$splits[[i]]
    train <- rsample::analysis(split)
    test  <- rsample::assessment(split)
    
    # Design matrices; [, -1] drops the intercept column from model.matrix.
    X_train <- model.matrix(
      as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + "))),
      train
    )[,-1]
    
    X_test <- model.matrix(
      as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + "))),
      test
    )[,-1]
    
    y_train <- train[[outcome]]
    y_test  <- test[[outcome]]
    
    # Inner CV to pick lambda for this fold's training data.
    cv_fit <- glmnet::cv.glmnet(
      X_train, y_train,
      family = "binomial",
      alpha = alpha,
      nfolds = 5
    )
    
    prob <- predict(cv_fit, X_test, s = "lambda.min", type = "response")
    pROC::roc(y_test, as.vector(prob), quiet = TRUE)$auc
  })
  
  tibble::tibble(
    model = paste0("Elastic Net (alpha=", alpha, ")"),
    auc_mean = mean(aucs),
    auc_sd   = sd(aucs)
  )
}
# glmnet does not accept NA values -> keep complete cases only.
data2_clean <- data2 %>% drop_na()

# Ridge (alpha = 0) performed best: it regularizes and copes with
# colinearity without discarding variables.

auc_enet_ridge <- cv_auc_elastic(
  data       = data2_clean,
  outcome    = "clinical_diagnosis",
  predictors = vars,
  alpha      = 0,
  v          = 10
)

auc_enet_ridge
# Elastic net, alpha = 0.5.
auc_enet <- cv_auc_elastic(
  data       = data2_clean,
  outcome    = "clinical_diagnosis",
  predictors = vars,
  alpha      = 0.5,
  v          = 10
)

auc_enet
# Lasso (alpha = 1).
auc_enet_lasso <- cv_auc_elastic(
  data       = data2_clean,
  outcome    = "clinical_diagnosis",
  predictors = vars,
  alpha      = 1,
  v          = 10
)

auc_enet_lasso
# Same cross-validated penalized logistic regression, but additionally
# recording which coefficients Lasso/elastic net keeps in each fold.

# Returns a one-row tibble with the mean/sd AUC and a list-column holding the
# variables selected (non-zero coefficient at lambda.min) across all folds.
cv_auc_elastic_with_coefs <- function(
  data, outcome, predictors,
  alpha = 1, v = 10, seed = 123
) {
  # 1) Drop NA and empty predictor names.
  predictors <- predictors[!is.na(predictors)]
  predictors <- predictors[predictors != ""]
  
  # 2) Back-tick names with spaces / accents.
  predictors_q <- sprintf("`%s`", predictors)
  
  set.seed(seed)
  folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
  
  # Preallocate per-fold results.
  aucs <- numeric(length(folds$splits))
  selected_vars <- vector("list", length(folds$splits))
  
  for (i in seq_along(folds$splits)) {
    # Re-seed per fold so the inner cv.glmnet folds are reproducible.
    set.seed(seed + i)
    
    split <- folds$splits[[i]]
    train <- rsample::analysis(split)
    test  <- rsample::assessment(split)
    
    fml <- as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + ")))
    
    # Design matrices without the intercept column.
    X_train <- model.matrix(fml, train)[, -1, drop = FALSE]
    X_test  <- model.matrix(fml, test)[, -1, drop = FALSE]
    
    y_train <- train[[outcome]]
    y_test  <- test[[outcome]]
    
    cv_fit <- glmnet::cv.glmnet(
      X_train, y_train,
      family = "binomial",
      alpha = alpha,
      nfolds = 5
    )
    
    # --- AUC on the held-out fold ---
    prob <- predict(cv_fit, X_test, s = "lambda.min", type = "response")
    aucs[i] <- pROC::roc(y_test, as.vector(prob), quiet = TRUE)$auc
    
    # --- variables selected in this fold ---
    coefs <- coef(cv_fit, s = "lambda.min")   # S3 coef() method at lambda.min
    sel <- rownames(coefs)[as.numeric(coefs) != 0]
    sel <- setdiff(sel, "(Intercept)")
    
    selected_vars[[i]] <- sel
  }
  
  tibble::tibble(
    auc_mean = mean(aucs),
    auc_sd   = sd(aucs),
    selected_vars = list(unlist(selected_vars))
  )
}
res_lasso <- cv_auc_elastic_with_coefs(
  data2_clean,
  "clinical_diagnosis",
  vars,
  alpha = 1
)
# LASSO stability: how often each variable got a non-zero coefficient
# across the outer CV folds (selection frequency in [0, 1]).
library(dplyr)

# Number of outer folds; must match `v` (default 10) used in the
# cv_auc_elastic_with_coefs() call that produced res_lasso.
n_folds <- 10

lasso_importance <- tibble(var = res_lasso$selected_vars[[1]]) %>%
  count(var) %>%
  mutate(freq = n / n_folds) %>%
  arrange(desc(freq))

lasso_importance

15 Importancia de variables por permutación

#Ahora calculo la importancia por permutación de cada variable: de a una variable, permuto y veo cuánto cambia el AUC

#' Cross-validated permutation importance for a glmnet logistic model.
#'
#' For each outer fold: fit cv.glmnet on the training part, compute the
#' baseline AUC on the held-out part, then permute each model-matrix column
#' n_perm times and record the mean AUC drop. Columns whose permutation hurts
#' the AUC most contribute most to the model's predictive performance.
#'
#' NOTE(review): permutation is done per model.matrix column, so a factor
#' expanded into several dummy columns is permuted dummy-by-dummy rather than
#' as one variable — confirm this is the intended behavior.
#'
#' @param data data.frame containing the outcome and all predictors.
#' @param outcome Name (string) of the binary outcome column.
#' @param predictors Character vector of predictor column names.
#' @param alpha glmnet mixing parameter: 0 = Ridge, 1 = LASSO.
#' @param v Number of outer CV folds.
#' @param n_perm Permutations per column per fold.
#' @param seed Base seed; each fold re-seeds with seed + i.
#' @return Tibble with one row per model-matrix column: mean_delta_auc
#'   (baseline AUC minus mean permuted AUC, averaged over folds), sorted
#'   in decreasing order.
perm_importance_cv <- function(
  data, outcome, predictors,
  alpha = 0, v = 10, n_perm = 20, seed = 123
) {
  # 1) Drop NA and empty entries from the predictor name vector
  predictors <- predictors[!is.na(predictors)]
  predictors <- predictors[predictors != ""]
  
  # 2) Backtick-quote names so columns with spaces/accents parse in a formula
  predictors_q <- sprintf("`%s`", predictors)
  
  set.seed(seed)
  # Outer CV, stratified on the outcome to keep class balance in every fold
  folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
  
  all_imp <- list()
  
  for (i in seq_along(folds$splits)) {
    # Per-fold seed: reproducible inner CV folds and permutation draws
    set.seed(seed + i)
    
    split <- folds$splits[[i]]
    train <- rsample::analysis(split)
    test  <- rsample::assessment(split)
    
    fml <- as.formula(
      paste(outcome, "~", paste(predictors_q, collapse = " + "))
    )
    
    # Design matrices; [, -1] drops the intercept column (glmnet adds its own)
    X_train <- model.matrix(fml, train)[, -1, drop = FALSE]
    X_test  <- model.matrix(fml, test)[, -1, drop = FALSE]
    
    y_train <- train[[outcome]]
    y_test  <- test[[outcome]]
    
    # --- baseline model for this fold ---
    cv_fit <- glmnet::cv.glmnet(
      X_train, y_train,
      family = "binomial",
      alpha = alpha,
      nfolds = 5
    )
    
    prob_base <- predict(
      cv_fit,
      X_test,
      s = "lambda.min",
      type = "response"
    )
    
    # Baseline held-out AUC before any permutation
    auc_base <- pROC::roc(
      y_test,
      as.vector(prob_base),
      quiet = TRUE
    )$auc
    
    # --- permutations: shuffle one column at a time ---
    for (var in colnames(X_test)) {
      
      # Held-out AUC after shuffling this column, repeated n_perm times
      auc_perm <- replicate(n_perm, {
        Xp <- X_test
        Xp[, var] <- sample(Xp[, var])
        
        prob <- predict(
          cv_fit,
          Xp,
          s = "lambda.min",
          type = "response"
        )
        
        pROC::roc(
          y_test,
          as.vector(prob),
          quiet = TRUE
        )$auc
      })
      
      all_imp[[length(all_imp) + 1]] <- tibble::tibble(
        var = var,
        delta_auc = as.numeric(auc_base - mean(auc_perm))
      )
    }
  }
  
  # Average the per-fold AUC drops for each column, most important first
  dplyr::bind_rows(all_imp) %>%
    dplyr::group_by(var) %>%
    dplyr::summarise(
      mean_delta_auc = mean(delta_auc),
      .groups = "drop"
    ) %>%
    dplyr::arrange(desc(mean_delta_auc))
}
#Se ve que la importancia por permutación (o sea, la importancia de cada variable en la capacidad predictiva del modelo) es diferente de la importancia de las variables según Lasso
set.seed(123)

# Permutation importance under Ridge (alpha = 0): every variable keeps a
# non-zero coefficient, so the AUC drop isolates predictive contribution.
perm_imp_ridge <- perm_importance_cv(
  data       = data2_clean,
  outcome    = "clinical_diagnosis",
  predictors = vars,
  alpha      = 0,      # Ridge penalty
  v          = 10,     # outer CV folds
  n_perm     = 20,     # permutations per variable per fold
  seed       = 123
)

perm_imp_ridge

16 Importancia de variables: importancia por permutación vs estabilidad Lasso

library(dplyr)

# Permutation importance (Ridge): mean held-out AUC drop per variable.
perm_tbl <- perm_imp_ridge %>%
  rename(delta_auc = mean_delta_auc)

# LASSO stability: selection frequency across the outer CV folds.
lasso_tbl <- lasso_importance %>%
  rename(lasso_freq = freq)

# One table with both importance measures. Variables that LASSO never
# selected have no row in lasso_tbl, so the join leaves NA; treat that
# as a selection frequency of 0.
importance_tbl <- perm_tbl %>%
  left_join(lasso_tbl, by = "var") %>%
  mutate(lasso_freq = coalesce(lasso_freq, 0))
importance_tbl
#Si bien lasso y la importancia por permutaciones coinciden en varias variables, en otras no tanto. Veamos el gráfico

library(ggplot2)
library(ggrepel)

# Scatter of the two importance measures per variable:
# x = LASSO selection frequency (stability), y = permutation AUC drop (Ridge).
ggplot(
  importance_tbl,
  aes(x = lasso_freq, y = delta_auc, label = var)
) +
  geom_point(size = 3, alpha = 0.8) +
  # Non-overlapping variable labels
  geom_text_repel(
    size = 4,
    max.overlaps = Inf,
    box.padding = 0.5,
    point.padding = 0.3,
    min.segment.length = 0
  ) +
  # Reference lines: 50% selection frequency and the median AUC drop,
  # splitting the plot into quadrants (keep vs drop candidates)
  geom_vline(xintercept = 0.5, linetype = 3, color = "grey50") +
  geom_hline(yintercept = median(importance_tbl$delta_auc, na.rm = TRUE),
             linetype = 3, color = "grey50") +
  scale_x_continuous(
    limits = c(0, 1.05),
    breaks = c(0, 0.25, 0.5, 0.75, 1)
  ) +
  labs(
    x = "Estabilidad LASSO (frecuencia de selección)",
    y = "Importancia predictiva (ΔAUC permutado)",
    title = "Importancia de variables: selección vs contribución predictiva",
    subtitle = "Comparación entre estabilidad bajo LASSO y pérdida de AUC al permutar"
  ) +
  theme_minimal(base_size = 14)

17 Modelo logístico con regularización Ridge y reducido a 4 variables

#Entonces tengo: Variables imprescindibles, arriba a la derecha; variables prescindibles en todos los sentidos, abajo a la izquierda
#Pruebo sacar las variables prescindibles por AUC y vuelvo a calcular AUC: MEJORA un poco! a 0.696

# Reduced model: keep only the four variables that were both stable under
# LASSO and predictive under permutation importance.
vars2 <- c(
  "MMSE",
  "Fluidez (animales)",
  "Funciones ejecutivas",
  "Nivel educativo"
)

# Nested-CV AUC for the 4-variable Ridge model (alpha = 0).
auc_enet_ridge <- cv_auc_elastic(
  data = data2_clean,
  outcome = "clinical_diagnosis",
  predictors = vars2,
  v = 10,
  alpha = 0
)

auc_enet_ridge
# Plot the ROC curve for the reduced model.

# Out-of-fold predicted probabilities (pooled across the 10 CV folds).
oof <- cv_oof_pred(
  data2_clean,
  "clinical_diagnosis",
  vars2,
  v = 10,
  seed = 123
)

# ROC curve built from the pooled out-of-fold predictions.
roc_oof <- pROC::roc(response = oof$y, predictor = oof$prob, quiet = TRUE)
auc_val <- pROC::auc(roc_oof)

# Coordinates for plotting: false-positive rate vs sensitivity.
roc_df <- tibble::tibble(
  fpr = 1 - roc_oof$specificities,
  tpr = roc_oof$sensitivities
)

library(ggplot2)

# ROC curve for the reduced 4-variable model, from out-of-fold predictions.
ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line(linewidth = 1.3, color = "#1f77b4") +
  # Diagonal reference: a chance-level classifier (AUC = 0.5)
  geom_abline(
    intercept = 0, slope = 1,
    linetype = "dashed",
    color = "grey60"
  ) +
  coord_equal() +
  labs(
    title = "ROC descartando variables",
    subtitle = paste0("AUC = ", round(auc_val, 3)),
    x = "1 − Especificidad",
    y = "Sensibilidad"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )